In [1]:
import pandas as pd
In [2]:
df = pd.read_csv('city.csv', sep=';')
df
Out[2]:
ID Name CountryCode District Population
0 1 Kabul AFG Kabol 1780000
1 2 Qandahar AFG Qandahar 237500
2 3 Herat AFG Herat 186800
3 4 Mazar-e-Sharif AFG Balkh 127800
4 5 Amsterdam NLD Noord-Holland 731200
... ... ... ... ... ...
4074 4075 Khan Yunis PSE Khan Yunis 123175
4075 4076 Hebron PSE Hebron 119401
4076 4077 Jabaliya PSE North Gaza 113901
4077 4078 Nablus PSE Nablus 100231
4078 4079 Rafah PSE Rafah 92020

4079 rows × 5 columns

In [3]:
df.groupby('CountryCode').sum()
Out[3]:
ID Population
CountryCode
ABW 129 29034
AFG 10 2332100
AGO 290 2561600
AIA 123 1556
ALB 34 270000
... ... ...
YEM 10695 1743700
YUG 14364 2189507
ZAF 32274 15196370
ZMB 22155 2473500
ZWE 24423 2730420

232 rows × 2 columns

In [11]:
df.groupby('CountryCode', as_index=False)[['Population']].max()
Out[11]:
CountryCode Population
0 ABW 29034
1 AFG 1780000
2 AGO 2022000
3 AIA 961
4 ALB 270000
... ... ...
227 YEM 503600
228 YUG 1204000
229 ZAF 2352121
230 ZMB 1317000
231 ZWE 1410000

232 rows × 2 columns

In [12]:
df.filter(items=['ID', 'Name'])
Out[12]:
ID Name
0 1 Kabul
1 2 Qandahar
2 3 Herat
3 4 Mazar-e-Sharif
4 5 Amsterdam
... ... ...
4074 4075 Khan Yunis
4075 4076 Hebron
4076 4077 Jabaliya
4077 4078 Nablus
4078 4079 Rafah

4079 rows × 2 columns

In [13]:
df[ ['ID', 'Name'] ]
Out[13]:
ID Name
0 1 Kabul
1 2 Qandahar
2 3 Herat
3 4 Mazar-e-Sharif
4 5 Amsterdam
... ... ...
4074 4075 Khan Yunis
4075 4076 Hebron
4076 4077 Jabaliya
4077 4078 Nablus
4078 4079 Rafah

4079 rows × 2 columns

In [20]:
# df.groupby('CountryCode', as_index=False).count().sort_values(by='Population', ascending=False).head(8)['Population'].sum()
df.groupby('CountryCode', as_index=False).count().sort_values(by='Population', ascending=False).head(10)
Out[20]:
CountryCode ID Name District Population
38 CHN 363 363 363 363
95 IND 341 341 341 341
216 USA 274 274 274 274
28 BRA 250 250 250 250
104 JPN 248 248 248 248
176 RUS 189 189 189 189
130 MEX 173 173 173 173
163 PHL 136 136 136 136
53 DEU 93 93 93 93
94 IDN 85 85 85 85
In [27]:
(df.groupby('CountryCode', as_index=False)
    .filter(lambda x: len(x['Population']) > 100)
    .groupby('CountryCode', as_index=False)
    .max())
Out[27]:
CountryCode ID Name District Population
0 BRA 455 Águas Lindas de Goiás Tocantins 9968485
1 CHN 2252 Zunyi Zhejiang 9696300
2 IND 1364 Yeotmal (Yavatmal) West Bengali 10500000
3 JPN 1779 Zama Yamanashi 7980230
4 MEX 2687 Zumpango Zacatecas 8591309
5 PHL 900 Zamboanga Western Visayas 2173831
6 RUS 3768 Štšolkovo Yamalin Nenetsia 8389200
7 USA 4066 Yonkers Wisconsin 8008278
In [28]:
df2 = (df.groupby('CountryCode', as_index=False)
    .filter(lambda x: len(x['Population']) > 100)
    .groupby('CountryCode', as_index=False)
    .max())
df2
Out[28]:
CountryCode ID Name District Population
0 BRA 455 Águas Lindas de Goiás Tocantins 9968485
1 CHN 2252 Zunyi Zhejiang 9696300
2 IND 1364 Yeotmal (Yavatmal) West Bengali 10500000
3 JPN 1779 Zama Yamanashi 7980230
4 MEX 2687 Zumpango Zacatecas 8591309
5 PHL 900 Zamboanga Western Visayas 2173831
6 RUS 3768 Štšolkovo Yamalin Nenetsia 8389200
7 USA 4066 Yonkers Wisconsin 8008278
In [30]:
df2[ df2.Population > 5000000 ][['CountryCode', 'Population']]
Out[30]:
CountryCode Population
0 BRA 9968485
1 CHN 9696300
2 IND 10500000
3 JPN 7980230
4 MEX 8591309
6 RUS 8389200
7 USA 8008278
In [ ]:
 
In [42]:
df.groupby('CountryCode')\
    .apply(lambda x: x['Population'].max() if len(x['CountryCode']) > 100 else 0)\
    .to_frame()\
    .sort_values(by=0, ascending=False)\
    .rename(columns={0: 'Population'})\
    .query('Population > 5000000')\
    .reset_index()
Out[42]:
CountryCode Population
0 IND 10500000
1 BRA 9968485
2 CHN 9696300
3 MEX 8591309
4 RUS 8389200
5 USA 8008278
6 JPN 7980230
In [ ]:
 
In [37]:
df
Out[37]:
ID Name CountryCode District Population
0 1 Kabul AFG Kabol 1780000
1 2 Qandahar AFG Qandahar 237500
2 3 Herat AFG Herat 186800
3 4 Mazar-e-Sharif AFG Balkh 127800
4 5 Amsterdam NLD Noord-Holland 731200
... ... ... ... ... ...
4074 4075 Khan Yunis PSE Khan Yunis 123175
4075 4076 Hebron PSE Hebron 119401
4076 4077 Jabaliya PSE North Gaza 113901
4077 4078 Nablus PSE Nablus 100231
4078 4079 Rafah PSE Rafah 92020

4079 rows × 5 columns

In [38]:
df[ df.Population > 8000000 ]
Out[38]:
ID Name CountryCode District Population
205 206 São Paulo BRA São Paulo 9968485
938 939 Jakarta IDN Jakarta Raya 9604900
1023 1024 Mumbai (Bombay) IND Maharashtra 10500000
1889 1890 Shanghai CHN Shanghai 9696300
2330 2331 Seoul KOR Seoul 9981619
2514 2515 Ciudad de México MEX Distrito Federal 8591309
2821 2822 Karachi PAK Sindh 9269265
3356 3357 Istanbul TUR Istanbul 8787958
3579 3580 Moscow RUS Moscow (City) 8389200
3792 3793 New York USA New York 8008278
In [39]:
df.query('Population > 8000000')
Out[39]:
ID Name CountryCode District Population
205 206 São Paulo BRA São Paulo 9968485
938 939 Jakarta IDN Jakarta Raya 9604900
1023 1024 Mumbai (Bombay) IND Maharashtra 10500000
1889 1890 Shanghai CHN Shanghai 9696300
2330 2331 Seoul KOR Seoul 9981619
2514 2515 Ciudad de México MEX Distrito Federal 8591309
2821 2822 Karachi PAK Sindh 9269265
3356 3357 Istanbul TUR Istanbul 8787958
3579 3580 Moscow RUS Moscow (City) 8389200
3792 3793 New York USA New York 8008278
In [ ]: